import pandas as pd
import altair as alt
from vega_datasets import data
import re
import numpy as np
Anna Botsula, CS 4, 2021
Dataset provided by Avocado provider for USA cities during 2015 year - 2018 year (3 months).It includes region name, date, prices and volume of sales. Also amount of sold small and large bags of avodacado per period.
Main goal of this work will be to check if there is some relation between amount of avocados sailed in different regions, months and year.
# Read avocado sales data
df_avocado_raw = pd.read_csv('./avocado.csv')
df_avocado_raw
| Unnamed: 0 | Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2015-12-27 | 1.33 | 64236.62 | 1036.74 | 54454.85 | 48.16 | 8696.87 | 8603.62 | 93.25 | 0.0 | conventional | 2015 | Albany |
| 1 | 1 | 2015-12-20 | 1.35 | 54876.98 | 674.28 | 44638.81 | 58.33 | 9505.56 | 9408.07 | 97.49 | 0.0 | conventional | 2015 | Albany |
| 2 | 2 | 2015-12-13 | 0.93 | 118220.22 | 794.70 | 109149.67 | 130.50 | 8145.35 | 8042.21 | 103.14 | 0.0 | conventional | 2015 | Albany |
| 3 | 3 | 2015-12-06 | 1.08 | 78992.15 | 1132.00 | 71976.41 | 72.58 | 5811.16 | 5677.40 | 133.76 | 0.0 | conventional | 2015 | Albany |
| 4 | 4 | 2015-11-29 | 1.28 | 51039.60 | 941.48 | 43838.39 | 75.78 | 6183.95 | 5986.26 | 197.69 | 0.0 | conventional | 2015 | Albany |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18244 | 7 | 2018-02-04 | 1.63 | 17074.83 | 2046.96 | 1529.20 | 0.00 | 13498.67 | 13066.82 | 431.85 | 0.0 | organic | 2018 | WestTexNewMexico |
| 18245 | 8 | 2018-01-28 | 1.71 | 13888.04 | 1191.70 | 3431.50 | 0.00 | 9264.84 | 8940.04 | 324.80 | 0.0 | organic | 2018 | WestTexNewMexico |
| 18246 | 9 | 2018-01-21 | 1.87 | 13766.76 | 1191.92 | 2452.79 | 727.94 | 9394.11 | 9351.80 | 42.31 | 0.0 | organic | 2018 | WestTexNewMexico |
| 18247 | 10 | 2018-01-14 | 1.93 | 16205.22 | 1527.63 | 2981.04 | 727.01 | 10969.54 | 10919.54 | 50.00 | 0.0 | organic | 2018 | WestTexNewMexico |
| 18248 | 11 | 2018-01-07 | 1.62 | 17489.58 | 2894.77 | 2356.13 | 224.53 | 12014.15 | 11988.14 | 26.01 | 0.0 | organic | 2018 | WestTexNewMexico |
18249 rows × 14 columns
# Find all regions
regions = df_avocado_raw['region'].unique()
# Prepare regions
new_regions = []
for r in regions:
temp = re.findall('[A-Z][^A-Z]*', r)
new_regions.append(' '.join(temp))
# Change regions in raw df
columns = list(df_avocado_raw.columns)
for i, row in df_avocado_raw.iterrows():
temp = re.findall('[A-Z][^A-Z]*', row['region'])
df_avocado_raw.iloc[i, columns.index('region')] = ' '.join(temp)
# Download data for USA
states = alt.topo_feature(data.us_10m.url, feature='states')
# Prepare background chart of USA to map cities on it
background = alt.Chart(states).mark_geoshape(
fill='lightgray',
stroke='white'
).properties(
).project('albersUsa')
background
# Read cities
df_cities = pd.read_csv('./uscities.csv')
# Create df with selected cities coordinates
selected_cities_df = pd.DataFrame(columns=df_cities.columns)
for r in range(len(new_regions)):
rows = df_cities[df_cities['city_ascii'] == new_regions[r]]
if len(rows) != 0:
selected_cities_df.loc[r] = rows.iloc[0]
# Filter avocado data due to detected cities that we always have city and values for each row
selected_cities_list = selected_cities_df['city'].unique()
months_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July','August','September','October','November','December']
per_population_column = []
months_column = []
month_number = []
for i, row in df_avocado_raw.iterrows():
if (row['region'] not in selected_cities_list):
df_avocado_raw = df_avocado_raw.drop(i)
else:
per_population_column.append( row['Total Volume'] / int(selected_cities_df[selected_cities_df['city'] == row['region']]['population']))
date = row['Date']
months_column.append(months_names[int(date.split('-')[1]) - 1])
month_number.append(int(date.split('-')[1]))
# Add avocado per person coef and month of the year columns
df_avocado_raw.insert(2, "avocado_per_person", per_population_column)
df_avocado_raw.insert(1, "month", months_column)
df_avocado_raw.insert(3, "month_number", month_number)
df_avocado_raw.head()
| Unnamed: 0 | month | month_number | Date | avocado_per_person | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | December | 12 | 2015-12-27 | 0.109547 | 1.33 | 64236.62 | 1036.74 | 54454.85 | 48.16 | 8696.87 | 8603.62 | 93.25 | 0.0 | conventional | 2015 | Albany |
| 1 | 1 | December | 12 | 2015-12-20 | 0.093586 | 1.35 | 54876.98 | 674.28 | 44638.81 | 58.33 | 9505.56 | 9408.07 | 97.49 | 0.0 | conventional | 2015 | Albany |
| 2 | 2 | December | 12 | 2015-12-13 | 0.201609 | 0.93 | 118220.22 | 794.70 | 109149.67 | 130.50 | 8145.35 | 8042.21 | 103.14 | 0.0 | conventional | 2015 | Albany |
| 3 | 3 | December | 12 | 2015-12-06 | 0.134711 | 1.08 | 78992.15 | 1132.00 | 71976.41 | 72.58 | 5811.16 | 5677.40 | 133.76 | 0.0 | conventional | 2015 | Albany |
| 4 | 4 | November | 11 | 2015-11-29 | 0.087041 | 1.28 | 51039.60 | 941.48 | 43838.39 | 75.78 | 6183.95 | 5986.26 | 197.69 | 0.0 | conventional | 2015 | Albany |
# Create slider for year selection
input_slider = alt.binding_range(min=df_avocado_raw.year.min(), max=2018, step=1, name='select year:')
select_year = alt.selection_single(name="year", fields = ['year'], bind=input_slider, init = {'year': 2017})
Main goal of next visualisation was to introduce to user locations of observed cities and also give understanding which cities have higher or lower average prices. Also it shows amount of sailed avocados per city. And all of these could be compared by year.
Even we can`t see much difference between the years, this visualisation help us to recognise some differnce between cities and simultaneously compare price to sold amount. And there is no doubts that using map is intuitive and easy way to see locations of regions.
Pros:
Cons:
# Disable max rows for altair
alt.data_transformers.disable_max_rows()
# hover = alt.selection(type='single', on='mouseover', nearest=True,
# fields=['lat', 'lng'])
base = alt.Chart(df_avocado_raw).transform_lookup(
lookup = 'region',
from_ = alt.LookupData(data = selected_cities_df,
key = 'city',
fields=['lat', 'lng'])
).properties(
title = alt.TitleParams(
text = 'Average price per avocado in $ / Avarage total amount of sales per city',
fontSize=18, dx=100, dy=-40)
).encode(
longitude='lng:Q',
latitude='lat:Q',
)
points = base.mark_point(filled = True).encode(
color=alt.Color('mean(AveragePrice):Q', scale = alt.Scale(
type = 'quantize', scheme = 'greens')),
size=alt.Size('mean(Total Volume):Q',
scale = alt.Scale(range = [50, 2000], domain = [df_avocado_raw['Total Volume'].min(), df_avocado_raw['Total Volume'].mean()]),
),
tooltip = [
alt.Tooltip('region:N'),
alt.Tooltip(f'mean(Total Volume):Q'),
alt.Tooltip(f'mean(AveragePrice):Q'),
]
).add_selection(select_year
).transform_filter(select_year
)
alt.layer(background, points).configure_view(strokeWidth = 0, width = 950, height = 500).configure_legend(padding = 25)
# During data analysing, I found that there are 3 regions which give NaN values for Bags.
# So we should remove them.
df_avocado_no_outliers = df_avocado_raw[df_avocado_raw['region'] != 'California'][df_avocado_raw['region'] != 'Plains'][df_avocado_raw['region'] != 'West']
# Data preprocessing
for i in range(1):
quan_small = np.percentile(df_avocado_no_outliers['Small Bags'], 50)
quan_large = np.percentile(df_avocado_no_outliers['Large Bags'], 20)
for i, row in df_avocado_no_outliers.iterrows():
if (row['Small Bags'] < quan_small or row['Large Bags'] < quan_large):
df_avocado_no_outliers = df_avocado_no_outliers.drop(i)
quan_small = np.percentile(df_avocado_no_outliers['Small Bags'], 30)
quan_large = np.percentile(df_avocado_no_outliers['Large Bags'], 10)
for i, row in df_avocado_no_outliers.iterrows():
if (row['Small Bags'] < quan_small or row['Large Bags'] < quan_large):
df_avocado_no_outliers = df_avocado_no_outliers.drop(i)
<ipython-input-11-323cc80c7a6a>:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index. df_avocado_no_outliers = df_avocado_raw[df_avocado_raw['region'] != 'California'][df_avocado_raw['region'] != 'Plains'][df_avocado_raw['region'] != 'West']
The idea of next charts is to compare amount of Small and Large bags of Avocado sold in different regions through monthes and years.
This method is best to see the whole picture at a time, compare between regions and year time. It also gives oppurtunity to easily recognise some pattern through the year, could be useful for analytical predictions for Avocado marketers.
Pros:
Cons:
small_bags = alt.Chart(df_avocado_no_outliers).mark_rect().encode(
x = alt.X('month:N', title='Month of year', sort=alt.Sort(field='month_number', op='max', order='ascending')),
y = alt.Y('region:N', title='Regions'),
color = alt.Color('Small Bags:Q', scale = alt.Scale(scheme = 'greens', domain=[0, 200000])),
tooltip = [
alt.Tooltip('region:N'),
alt.Tooltip('Small Bags:Q'),
alt.Tooltip('month:N'),
]
).add_selection(select_year
).transform_filter(select_year
).properties(width = 450, height = 400,
title=alt.TitleParams(
text = 'Amount of Small avocado bags bought in different regions per selected Year.',
fontSize=14, dx=70, dy=-40)
)
large_bags = alt.Chart(df_avocado_no_outliers).mark_rect().encode(
# x = alt.X('month:N', title='Month of year'),
x = alt.X('month:N', title='Month of year', sort=alt.Sort(field='month_number', op='max', order='ascending')),
y = alt.Y('region:N', title='Regions'),
color = alt.Color('Large Bags:Q', scale = alt.Scale(scheme = 'orangered')),
tooltip = [
alt.Tooltip('region:N'),
alt.Tooltip('Small Bags:Q'),
alt.Tooltip('month:N'),
]
).add_selection(select_year
).transform_filter(select_year
).properties(width = 450, height = 400,
title=alt.TitleParams(
text = 'Amount of Large Avocado bags bought in different regions per selected Year.',
fontSize=14, dx=70, dy=-40)
)
alt.hconcat(small_bags.properties(width = 550, height = 400),
large_bags.properties(width = 550, height = 400)
).add_selection(select_year).properties(background = '#F9F9F9', padding = 15,
title = alt.TitleParams(
text = 'Comparison of Small and Large avocado bags demand due to month, year and region.', dx=350, dy=-30
)).configure_title(
anchor = 'start',
# frame = 'group',
fontSize = 18,
subtitleFontSize = 16
)
Next visualisation can give visual understanding of consumption of Avocadoes per person in different regions due to month of the year. To have this value I devided population by the total amount of sold avocadoes for each region. As we have same sources of data for each region, those values should be proportionaly convinient.
This method is both usual and easy to understand, but also show some really important information (but not the location, for this we can use 1 Chart :). I decided not to sort regions by this value to make comparison between the months easier, so they don`t move.
Pros:
Cons:
input_slider_month = alt.binding_select(options=months_names, name='Select Month:')
select_month = alt.selection_single(name="month", fields = ['month'], bind=input_slider_month, init = {'month': 'December'})
df_avocado_no_outliers = df_avocado_raw[df_avocado_raw['region'] != 'California'][df_avocado_raw['region'] != 'Plains'][df_avocado_raw['region'] != 'West']
alt.Chart(df_avocado_no_outliers).mark_bar().encode(
y = alt.Y('avocado_per_person:Q', scale=alt.Scale(domain = [df_avocado_no_outliers['avocado_per_person'].min(), df_avocado_no_outliers['avocado_per_person'].max()]), title="Avocado per person"),
# x = alt.X('region:N', sort=alt.Sort(field='avocado_per_person', op='max', order='descending')),
x = alt.X('region:N', title="Regions"),
color = alt.value('green')
).add_selection(select_month
).transform_filter(select_month
).properties(width = 850, height = 400, padding=25,
title=alt.TitleParams(
text = 'Avocado per person average amount (all years average) specificaly for regions and months',
fontSize=18, dx=10, dy=-40)
)
<ipython-input-21-e3da3ff61493>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index. df_avocado_no_outliers = df_avocado_raw[df_avocado_raw['region'] != 'California'][df_avocado_raw['region'] != 'Plains'][df_avocado_raw['region'] != 'West']